#!/usr/bin/python
'''
Determines which of the de novo repeats do not overlap with known repeats.
rep_lib is the RepeatModeler library (read as FASTA).
hit_gff is the GFF file produced by correlate_gff_gff.py.
'''
import sys
import fasta_read

def library_names(headers):
    """Return the set of repeat names found in the library headers.

    Each header is cut at the first "#" and only the part before it is
    kept, so headers that differ only after the "#" collapse to one name.

    headers: any iterable of header strings (e.g. the keys of the dict
    returned by fasta_read.read_fasta_dict).
    """
    return set(header.split("#")[0] for header in headers)


def annotated_name(line):
    """Extract the repeat name from one GFF line.

    Returns None when the line does not contain the substring "rnd".
    Otherwise takes the 9th tab-separated column, keeps the part before
    the first ";", then the text after the first ":" up to the next
    double quote (or up to the second ":" when one comes first).

    Raises IndexError when a line containing "rnd" has fewer than nine
    tab-separated columns or no ":" in the first ";"-delimited field.
    """
    if "rnd" not in line:
        return None
    group = line.strip().split("\t")[8]
    first_attr = group.split(";")[0]
    after_colon = first_attr.split(":")[1]
    return after_colon.split('"')[0]


def annotated_names(lines):
    """Return the set of repeat names extracted from an iterable of GFF lines.

    Lines for which annotated_name() returns None are ignored.
    """
    names = set()
    for line in lines:
        name = annotated_name(line)
        if name is not None:
            names.add(name)
    return names


def main(argv):
    """Print the repeat names that appear in the hit GFF file.

    argv[1] is the repeat library path, argv[2] is the hit GFF path.
    Exits with a usage message when either argument is missing.
    """
    if len(argv) < 3:
        sys.exit("usage: %s <rep_lib> <hit_gff>" % argv[0])
    rep_lib = argv[1]
    hit_gff = argv[2]

    # read in repeat library and pull out names.
    total_names = library_names(fasta_read.read_fasta_dict(rep_lib))

    # read and parse gff; text mode so the str operations work on both
    # Python 2 and Python 3, and "with" closes the file even on error.
    with open(hit_gff, "r") as fin:
        ant_names = annotated_names(fin)

    # Library names with no hit in the gff. Not printed at the moment;
    # iterate over denovo_names below instead of ant_names to list them.
    denovo_names = total_names.difference(ant_names)

    # print anot names.
    for name in ant_names:
        print(name)


if __name__ == "__main__":
    main(sys.argv)
